Part 1

Exercise 1

Function with requirements:

myfun <- function(y, m){ #Two parameters: A vector and sample size
  x <- sample(y, size = m) 
  s <- NULL
  d <- NULL
  for (i in x) {
    if ((i <= 150) && (i %% 4 == 0)) { #if value is <= 150 and div. 4
       s <- c(s, i) #append to end of vector
    }
    else if ((i > 150) && (i %% 3 == 0)) {
       d <- c(d, i) #append to end of vector
    }
  }
  
  l <- list("Sample" = x, "s" = s, "d" = d, "Mean of s" = mean(s), "Var of s" = var(s),
            "Mean of d" = mean(d), "Var of d" = var(d))
  #lists are great! we can add all the names we need to the list
  #so the output looks (somewhat) nice!
  return(l)
}

vec <- c(1:300) #creating vector of natural numbers 1-300

myfun(vec, 40)
## $Sample
##  [1]  10 102  77  16 130 112 193 114 125 183  78 103  38 104  12  65  74 283  20
## [20] 288 225  90 291 229 278  13  51 255  55 159 172 214 244  93 242  15  40  68
## [39]  58  82
## 
## $s
## [1]  16 112 104  12  20  40  68
## 
## $d
## [1] 183 288 225 291 255 159
## 
## $`Mean of s`
## [1] 53.14286
## 
## $`Var of s`
## [1] 1769.143
## 
## $`Mean of d`
## [1] 233.5
## 
## $`Var of d`
## [1] 2982.3

Exercise 2

Normal with mean = 10, sd = 10: black
Chi-Sq. 5 df: green
Chi-Sq. 8 df: red
Chi-Sq. 18 df: blue

x <- seq(-5, 40, length = 500) #x range
dist.x <- dnorm(x, 10, 10) #normal mean = 10, variance = 10

degf <- c(5, 8, 18)
colors <- c("green","red", "blue", "black")  # Set up the four colors
labels <- c("df=5", "df = 8", "df = 18",  "normal")       # Label the plots

plot(x, dist.x, ylim=c(0, 0.165), type="l", lty=2, lwd=2, xlab="x value",
     ylab="Density", main= "Comparison of Normal and Chi-square Distributions")
    # Plot normal density curve with Title'
    # I added the y limit so we can see all the peaks of the graphs!

for (i in 1:length(degf)){
  lines(x, dchisq(x, degf[i]), lty=1, lwd=2, col=colors[i]) # Plot chi-square
  #distribution curves on plot
}

legend("topright", inset=0.01, title="Distributions", # Set up the title of legend
       labels, lwd=2, lty=c(1, 1, 1, 2), col=colors)  # Set up line types appearing in the legend

Part 2

Exercise 3

Head of each year:

#importing dataset
library(data.table)
library(dplyr)

weather <- fread("weather.csv")

#converting Date to proper type
weather$Date <- as.Date(weather$Date, format = "%m/%d/%Y")
#arranging with respect to date (dplyr)
weather <- weather %>% arrange(Date) 

library(knitr)
library(kableExtra)
heads <- rbind(head(weather[format(weather$Date, "%Y") == 2020, ], 5),
                       head(weather[format(weather$Date, "%Y") == 2021, ], 5),
                       head(weather[format(weather$Date, "%Y") == 2022, ], 5))
Each Page is the head of another year
Page 1 = head(2020)
Page 2 = head(2021)
Page 3 = head(2022)

Ridgeplots:

#import libraries
library(ggplot2)
library(ggridges)

#Temps:

#get subsets
Temp_sd <- weather[weather$`Parameter Name` == "Outdoor Temperature", ]
#rename arithmetic mean column to temp
colnames(Temp_sd)[17] = "Mean Temperature"

theme_update(plot.title = element_text(hjust = 0.5))
#this sets the default for our plots to center the titles, which is amazing
#because we don't have to add that little line of code to all of the
#plots for this project

Temps <- ggplot(Temp_sd, aes(x = `Mean Temperature`, y = as.factor(format(Date, "%Y/%m")), fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
  labs(title = 'Average Temps in SD 2020-2022',
       y = "Year/Month")

The next code segments are nearly identical, so they are auto hidden. To view them, click the show button on the strip below.

#Wind Speed

WindSpd_sd <- weather[weather$`Parameter Name` == "Wind Speed - Resultant", ]
colnames(WindSpd_sd)[17] = "Mean Wind Speed"

WindSpd <- ggplot(WindSpd_sd, aes(x = `Mean Wind Speed`, y = as.factor(format(Date, "%Y/%m")), fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Wind Spd. (knots)", option = "C") +
  labs(title = 'Average Wind Speed (knots) in SD 2020-2022',
       y = "Year/Month")

#Rel Humidity

RelHDP_sd <- weather[weather$`Parameter Name` == "Relative Humidity", ]
colnames(RelHDP_sd)[17] = "Mean % Humidity"

Humidity <- ggplot(RelHDP_sd, aes(x = `Mean % Humidity`, y = as.factor(format(Date, "%Y/%m")), fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "% Humidity", option = "C") +
  labs(title = 'Average % Humidity in SD 2020-2022',
       y = "Year/Month")

#Ozone

Ozone_sd <- weather[weather$`Parameter Name` == "Ozone", ]
colnames(Ozone_sd)[17] = "Mean Ozone"

Ozone <- ggplot(Ozone_sd, aes(x = `Mean Ozone`, y = as.factor(format(Date, "%Y/%m")), fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Ozone (ppb)", option = "C") +
  labs(title = 'Average Ozone Concentration (ppb) in SD 2020-2022',
       y = "Year/Month")

#SO2

SO2_sd <- weather[weather$`Parameter Name` == "Sulfur dioxide", ]
colnames(SO2_sd)[17] = "Mean SO2"

SO2 <- ggplot(SO2_sd, aes(x = `Mean SO2`, y = as.factor(format(Date, "%Y/%m")), fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "SO2 (ppb)", option = "C") +
  labs(title = 'Average Sulfur Dioxide Concentration (ppb) in SD 2020-2022',
       y = "Year/Month")

#NO2 

NO2_sd <- weather[weather$`Parameter Name` == "Nitrogen dioxide (NO2)", ]
colnames(NO2_sd)[17] = "Mean NO2"

NO2 <- ggplot(NO2_sd, aes(x = `Mean NO2`, y = as.factor(format(Date, "%Y/%m")), fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "NO2 (ppb)", option = "C") +
  labs(title = 'Average Nitrogen Dioxide Concentration (ppb) in SD 2020-2022',
       y = "Year/Month")

#CO

CO_sd <- weather[weather$`Parameter Name` == "Carbon monoxide", ]
colnames(CO_sd)[17] = "Mean CO"

CO <- ggplot(CO_sd, aes(x = `Mean CO`, y = as.factor(format(Date, "%Y/%m")), fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "CO (ppm)", option = "C") +
  labs(title = 'Average Carbon Monoxide Concentration (ppm) in SD 2020-2022',
       y = "Year/Month")

Exercise 4

Our data for each respective variable (temp, wind, humidity, ozone, SO2, NO2, CO) is already neatly sorted from our last problem, so all we need to do is create ggplot boxplots for each respective dataframe (Temp_sd, Wind_sd, RelHDP_sd, Ozone_sd, SO2_sd, NO2_sd, CO_sd) for each year by creating yearly subsets:

#Creating the subsets for Temps

#Take a subset of the rows which have a year == desired_year
#format() syntax required for dates
Temp_sd2020 <- Temp_sd[format(Temp_sd$Date, "%Y") == 2020,]
Temp_sd2021 <- Temp_sd[format(Temp_sd$Date, "%Y") == 2021,]
Temp_sd2022 <- Temp_sd[format(Temp_sd$Date, "%Y") == 2022,]

The rest of the subset process is identical and is hidden. If you’d like to view it, click the Show button

#wind Speed

WindSpd_sd2020 <- WindSpd_sd[format(WindSpd_sd$Date, "%Y") == 2020,]
WindSpd_sd2021 <- WindSpd_sd[format(WindSpd_sd$Date, "%Y") == 2021,]
WindSpd_sd2022 <- WindSpd_sd[format(WindSpd_sd$Date, "%Y") == 2022,]


#humidity
RelHDP_sd2020 <- RelHDP_sd[format(RelHDP_sd$Date, "%Y") == 2020, ]
RelHDP_sd2021 <- RelHDP_sd[format(RelHDP_sd$Date, "%Y") == 2021, ]
RelHDP_sd2022 <- RelHDP_sd[format(RelHDP_sd$Date, "%Y") == 2022, ]

#ozone
Ozone_sd2020 <- Ozone_sd[format(Ozone_sd$Date, "%Y") == 2020, ]
Ozone_sd2021 <- Ozone_sd[format(Ozone_sd$Date, "%Y") == 2021, ]
Ozone_sd2022 <- Ozone_sd[format(Ozone_sd$Date, "%Y") == 2022, ]

#SO2
SO2_sd2020 <- SO2_sd[format(SO2_sd$Date, "%Y") == 2020, ]
SO2_sd2021 <- SO2_sd[format(SO2_sd$Date, "%Y") == 2021, ]
SO2_sd2022 <- SO2_sd[format(SO2_sd$Date, "%Y") == 2022, ]

#NO2
NO2_sd2020 <- NO2_sd[format(NO2_sd$Date, "%Y") == 2020, ]
NO2_sd2021 <- NO2_sd[format(NO2_sd$Date, "%Y") == 2021, ]
NO2_sd2022 <- NO2_sd[format(NO2_sd$Date, "%Y") == 2022, ]

#CO
CO_sd2020 <- CO_sd[format(CO_sd$Date, "%Y") == 2020, ]
CO_sd2021 <- CO_sd[format(CO_sd$Date, "%Y") == 2021, ]
CO_sd2022 <- CO_sd[format(CO_sd$Date, "%Y") == 2022, ]

Creating the boxplot for temps:

#Only weird thing here is the scale_discrete. I included this so
#the axis labels would be actual month names instead of the numbers

Temp_sd2020Box <- ggplot(Temp_sd2020, aes(x = format(Date, "%m"), 
                 y = `Mean Temperature`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average Temps (F) 2020 SD", x = "Month")

#I AM SO HAPPY THE MONTH.ABB THING WORKED WITH THE DISCRETE LABELS!!!

once again the code for the rest just changes the variables, so it is auto-hidden

#rest of the temps boxes
Temp_sd2021Box <- ggplot(Temp_sd2021, aes(x = format(Date, "%m"), 
                 y = `Mean Temperature`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Temps (Faren.) 2021", x = "Month")

Temp_sd2022Box <- ggplot(Temp_sd2022, aes(x = format(Date, "%m"), 
                                       y = `Mean Temperature`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Temps (Faren.) 2022", x = "Month")

#wind speed

WindSpd_sd2020Box <- ggplot(WindSpd_sd2020, aes(x = format(Date, "%m"), 
                   y = `Mean Wind Speed`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Wind Speeds (knots) 2020", x = "Month")

WindSpd_sd2021Box <- ggplot(WindSpd_sd2021, aes(x = format(Date, "%m"), 
                    y = `Mean Wind Speed`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Wind Speeds (knots) 2021", x = "Month")

WindSpd_sd2022Box <- ggplot(WindSpd_sd2022, aes(x = format(Date, "%m"), 
                   y = `Mean Wind Speed`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Wind Speeds (knots) 2022", x = "Month")

#Humidity

RelHDP_sd2020Box <- ggplot(RelHDP_sd2020, aes(x = format(Date, "%m"), 
                   y = `Mean % Humidity`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota % Humidity 2020", x = "Month")

RelHDP_sd2021Box <- ggplot(RelHDP_sd2021, aes(x = format(Date, "%m"), 
                   y = `Mean % Humidity`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota % Humidity 2021", x = "Month")

RelHDP_sd2022Box <- ggplot(RelHDP_sd2022, aes(x = format(Date, "%m"), 
                   y = `Mean % Humidity`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota % Humidity 2022", x = "Month")

#ozone

Ozone_sd2020Box <- ggplot(Ozone_sd2020, aes(x = format(Date, "%m"), 
                  y = `Mean Ozone`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Ozone Concentration (ppb) 2020", x = "Month")

Ozone_sd2021Box <- ggplot(Ozone_sd2021, aes(x = format(Date, "%m"), 
                  y = `Mean Ozone`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Ozone Concentration (ppb) 2021", x = "Month")

Ozone_sd2022Box <- ggplot(Ozone_sd2022, aes(x = format(Date, "%m"), 
                  y = `Mean Ozone`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Ozone Concentration (ppb) 2022", x = "Month")

#SO2

SO2_sd2020Box <- ggplot(SO2_sd2020, aes(x = format(Date, "%m"), 
                y = `Mean SO2`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Sulfur Dioxide Concentration (ppb) 2020", x = "Month")

SO2_sd2021Box <- ggplot(SO2_sd2021, aes(x = format(Date, "%m"), 
                y = `Mean SO2`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Sulfur Dioxide Concentration (ppb) 2021", x = "Month")

SO2_sd2022Box <- ggplot(SO2_sd2022, aes(x = format(Date, "%m"), 
                y = `Mean SO2`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Sulfur Dioxide Concentration (ppb) 2022", x = "Month")

#NO2
NO2_sd2020Box <- ggplot(NO2_sd2020, aes(x = format(Date, "%m"), 
                y = `Mean NO2`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Nitrogen Dioxide Concentration (ppb) 2020", x = "Month")

NO2_sd2021Box <- ggplot(NO2_sd2021, aes(x = format(Date, "%m"), 
                y = `Mean NO2`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Nitrogen Dioxide Concentration (ppb) 2021", x = "Month")

NO2_sd2022Box <- ggplot(NO2_sd2022, aes(x = format(Date, "%m"), 
                y = `Mean NO2`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Nitrogen Dioxide Concentration (ppb) 2022", x = "Month")

#CO

CO_sd2020Box <- ggplot(CO_sd2020, aes(x = format(Date, "%m"), 
                y = `Mean CO`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Carbon Monoxide Concentration (ppm) 2020", x = "Month")

CO_sd2021Box <- ggplot(CO_sd2021, aes(x = format(Date, "%m"), 
                y = `Mean CO`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Carbon Monoxide Concentration (ppm) 2021", x = "Month")

CO_sd2022Box <- ggplot(CO_sd2022, aes(x = format(Date, "%m"), 
                                      y = `Mean CO`, fill = format(Date, "%m"))) +
  geom_boxplot() +
  scale_fill_discrete(name = "Month", labels = month.abb[1:12]) +
  scale_x_discrete(labels = month.abb[1:12]) +
  labs(title = "Average South Dakota Carbon Monoxide Concentration (ppm) 2022", x = "Month")

Note about CO 2022. The Data for that set is incomplete, which is why it only goes to August.

Exercise 5

Interactive boxplot for meteorological data. We created the yearly boxplots in the last problem, so all we need to do is simply use the ggplotly the boxplots:

#ggplotly function converts ggplot directly to interactive plotly plot
ggplotly(Temp_sd2020Box)
ggplotly(Temp_sd2021Box)
ggplotly(Temp_sd2022Box)

The rest are redundant, click Show to see code

#wind speed
ggplotly(WindSpd_sd2020Box)
ggplotly(WindSpd_sd2021Box)
ggplotly(WindSpd_sd2022Box)

#humidity
ggplotly(RelHDP_sd2020Box)
ggplotly(RelHDP_sd2021Box)
ggplotly(RelHDP_sd2022Box)

Interactive Meteorlogical Boxplots:

Exercise 6

Counts of avg less than 32 and high greater than 80 for each year:
#exercise 6: counting number of occurences
#(results asis for kable)
library(kableExtra)

#freezing (I'm gonna use %>% for practice)

supercold2020 <- Temp_sd2020 %>%
  select(`Mean Temperature`) %>% #select only the avg temp column
  filter(`Mean Temperature` < 32) %>% #filter the column
  mutate(Year = 2020) %>% #add an identifyer column for the year
  mutate(Type = "avg below freezing") %>% #identifyer column for type of count
  select(Year, Type) #we can select just the year and type because we are only
                     #concerned with the counts!

#man I really like dyplr!! I was skeptical at first, but this stuff is great!

supercold2021 <- Temp_sd2021 %>%
  select(`Mean Temperature`) %>%
  filter(`Mean Temperature` < 32) %>%
  mutate(Year = 2021) %>%
  mutate(Type = "avg below freezing") %>%
  select(Year, Type)

supercold2022 <- Temp_sd2022 %>%
  select(`Mean Temperature`) %>%
  filter(`Mean Temperature` < 32) %>%
  mutate(Year = 2022) %>%
  mutate(Type = "avg below freezing") %>%
  select(Year, Type)


#high above 80:
hot_hot_HOT2020 <- Temp_sd2020 %>%
  select(`1st Max Value`) %>%
  filter(`1st Max Value` > 80) %>%
  mutate(Year = 2020) %>%
  mutate(Type = "HI. hotter than 80") %>%
  select(Year, Type)

hot_hot_HOT2021 <- Temp_sd2021 %>%
  select(`1st Max Value`) %>%
  filter(`1st Max Value` > 80) %>%
  mutate(Year = 2021) %>%
  mutate(Type = "HI. hotter than 80") %>%
  select(Year, Type)

hot_hot_HOT2022 <- Temp_sd2022 %>%
  select(`1st Max Value`) %>%
  filter(`1st Max Value` > 80) %>%
  mutate(Year = 2022) %>%
  mutate(Type = "HI. hotter than 80") %>%
  select(Year, Type)


temp_counts_df <- rbind(supercold2020, supercold2021, supercold2022,
                        hot_hot_HOT2020, hot_hot_HOT2021, hot_hot_HOT2022)

counts_of_temps <- table(temp_counts_df)

counts_of_temps %>% #using kable just to make table look a little nicer
  knitr::kable(caption = 
                 "Counts of Daily Avg and Hi. Temps in SD 2020-2022") %>%
  kableExtra::kable_styling(bootstrap_options = c("hover", "striped"), full_width = FALSE)
Counts of Daily Avg and Hi. Temps in SD 2020-2022
avg below freezing HI. hotter than 80
2020 171 159
2021 126 130
2022 165 94

Exercise 7

First rows of Summer and Winter

Summer <- subset(weather, format(Date, "%m") %in% c('06', '07', '08'))
#%in% operator lets us use a vector to return logical values for the subset!
Winter <- subset(weather, format(Date, "%m") %in% c('01', '02', '12'))

Summer <- Summer %>% mutate(Season = "Summer")
Winter <- Winter %>% mutate(Season = "Winter")

#select a few particular columns to print to neat table
SeasonalHeads <- rbind(head(Summer, 10), head(Winter, 10)) %>%
                  select(Season, Date, `Parameter Name`, `Arithmetic Mean`,
                         `Units of Measure`)
SeasonalHeads %>%
  knitr::kable(caption = "First few rows of Summer and Winter") %>%
  kableExtra::kable_styling(bootstrap_options = c("hover", "striped", "bordered"), full_width = TRUE) %>%
  pack_rows("Summer", 1, 10, color = "darkorange") %>%
  pack_rows("Winter", 11, 20, color = "dodgerblue")
First few rows of Summer and Winter
Season Date Parameter Name Arithmetic Mean Units of Measure
Summer
Summer 2020-06-01 Relative Humidity 48.791667 Percent relative humidity
Summer 2020-06-01 Outdoor Temperature 76.208333 Degrees Fahrenheit
Summer 2020-06-01 Outdoor Temperature 75.625000 Degrees Fahrenheit
Summer 2020-06-01 Wind Speed - Resultant 6.125000 Knots
Summer 2020-06-01 Wind Direction - Resultant 222.250000 Degrees Compass
Summer 2020-06-01 Wind Speed - Resultant 6.691667 Knots
Summer 2020-06-01 Wind Direction - Resultant 210.362500 Degrees Compass
Summer 2020-06-01 Wind Speed - Resultant 5.116667 Knots
Summer 2020-06-01 Wind Direction - Resultant 232.820833 Degrees Compass
Summer 2020-06-01 Carbon monoxide 0.187333 Parts per million
Winter
Winter 2020-01-01 Relative Humidity 52.541667 Percent relative humidity
Winter 2020-01-01 Relative Humidity 87.750000 Percent relative humidity
Winter 2020-01-01 Outdoor Temperature 36.458333 Degrees Fahrenheit
Winter 2020-01-01 Outdoor Temperature 33.000000 Degrees Fahrenheit
Winter 2020-01-01 Wind Speed - Resultant 3.825000 Knots
Winter 2020-01-01 Wind Direction - Resultant 267.541667 Degrees Compass
Winter 2020-01-01 Wind Speed - Resultant 7.866667 Knots
Winter 2020-01-01 Wind Direction - Resultant 140.808333 Degrees Compass
Winter 2020-01-01 Wind Speed - Resultant 3.512500 Knots
Winter 2020-01-01 Wind Direction - Resultant 225.066667 Degrees Compass

Dimensions

#Delete season column so dimension is proper
Summer <- Summer %>% select(!Season)
Winter <- Winter %>% select(!Season)

#creating table of the dimensions
Dimensions <- data.frame(
  Season = c("Summer", "Winter"), #creating col of vector in order
  "num records" = c(dim(Summer)[1], dim(Winter)[1]), #select col one of dim()
  "num variables" = c(dim(Summer)[2], dim(Winter)[2])
)

Dimensions %>%
  knitr::kable(caption = "Dimenions of Summer and Winter Subsets") %>%
  kableExtra::kable_styling(bootstrap_options = c("hover", "striped"), full_width = FALSE, position = "center")
Dimenions of Summer and Winter Subsets
Season num.records num.variables
Summer 7360 29
Winter 7654 29

Exercise 8

#daily wind for summer

#creating subsets so problem is easier

summer2020 <- Summer %>% 
  filter(format(Date, "%Y") == 2020)
summer2021 <- Summer %>% 
  filter(format(Date, "%Y") == 2021)
summer2022 <- Summer %>% 
  filter(format(Date, "%Y") == 2022)

winter2020 <- Winter %>%
  filter(format(Date, "%Y") == 2020)
winter2021 <- Winter %>%
  filter(format(Date, "%Y") == 2021)
winter2022 <- Winter %>%
  filter(format(Date, "%Y") == 2022)

#we're actually gonna bypass creating two vectors for each one
#with the use of dplyr's summarise. We can calculate the mean using
#summarise because it will automatically divide total winds / numrecords

#filter winds, select mean col vector, apply summarise(mean())
avgwinds_summer2020 <- summer2020 %>%
  filter(`Parameter Name` == "Wind Speed - Resultant") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric() #convert dataframe to single number

avgwinds_winter2020 <- winter2020 %>%
  filter(`Parameter Name` == "Wind Speed - Resultant") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

#this strategy also cleans the data while we calculate everything
#because we set na.rm to true for calculating the mean

Rest of the code auto-hidden

#rest of the winds
avgwinds_summer2021 <- summer2021 %>%
  filter(`Parameter Name` == "Wind Speed - Resultant") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgwinds_winter2021 <- winter2021 %>%
  filter(`Parameter Name` == "Wind Speed - Resultant") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgwinds_summer2022 <- summer2022 %>%
  filter(`Parameter Name` == "Wind Speed - Resultant") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgwinds_winter2022 <- winter2022 %>%
  filter(`Parameter Name` == "Wind Speed - Resultant") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

#temps

avgtemps_summer2020 <- summer2020 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgtemps_winter2020 <- winter2020 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgtemps_summer2021 <- summer2021 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgtemps_winter2021 <- winter2021 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgtemps_summer2022 <- summer2022 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avgtemps_winter2022 <- winter2022 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

#humidity

avghumperc_summer2020 <- summer2020 %>%
  filter(`Parameter Name` == "Relative Humidity") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avghumperc_winter2020 <- winter2020 %>%
  filter(`Parameter Name` == "Relative Humidity") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avghumperc_summer2021 <- summer2021 %>%
  filter(`Parameter Name` == "Relative Humidity") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avghumperc_winter2021 <- winter2021 %>%
  filter(`Parameter Name` == "Relative Humidity") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()


avghumperc_summer2022 <- summer2022 %>%
  filter(`Parameter Name` == "Relative Humidity") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

avghumperc_winter2022 <- winter2022 %>%
  filter(`Parameter Name` == "Relative Humidity") %>%
  select(`Arithmetic Mean`) %>%
  summarise(mean(`Arithmetic Mean`, na.rm=TRUE)) %>%
  as.numeric()

Creating and printing the table

#just follow the pattern of combining the vectors for each col in order of
#summer2020, summer2021, summer2022, winter2020, ...
tableframe <- data.frame(
  "season-year" = c("Summer-2020", "Summer-2021", "Summer-2022",
                    "Winter-2020", "Winter-2021", "Winter-2022"),
  "avg temps"=c(avgtemps_summer2020, avgtemps_summer2021, avgtemps_summer2022,
                avgtemps_winter2020, avgtemps_winter2021, avgtemps_winter2022),
  "avg windSpd"=c(avgwinds_summer2020, avgwinds_summer2021, avgwinds_summer2022,
                avgwinds_winter2020, avgwinds_winter2021, avgwinds_winter2022),
  "avg humidity"=c(avghumperc_summer2020, avghumperc_summer2021, 
                     avghumperc_summer2022, avghumperc_winter2020,
                     avghumperc_winter2021, avghumperc_winter2022)

)

tableframe %>%
  knitr::kable(caption = "Seasonal Averages for Daily Avg Temps, Winds, and Humidity %") %>%
  kableExtra::kable_styling(bootstrap_options = c("hover", "striped", "bordered"), full_width = TRUE) %>%
  pack_rows("Summer", 1, 3, color = "darkorange") %>%
  pack_rows("Winter", 4, 6, color = "dodgerblue")
Seasonal Averages for Daily Avg Temps, Winds, and Humidity %
season.year avg.temps avg.windSpd avg.humidity
Summer
Summer-2020 72.32543 4.806303 50.04484
Summer-2021 74.55649 4.849363 59.71301
Summer-2022 73.71784 5.500201 44.55435
Winter
Winter-2020 30.10824 4.986248 68.11296
Winter-2021 25.91245 5.029815 69.85231
Winter-2022 23.86079 6.884367 64.27558

Exercise 9

Daily differences of High and Mean Temperatures for the Summers of 2020-2022

#I'm going to do my general filtering and selecting that I have done before
#and then mutate an extra column which is the vector difference of
#hi and mean
tempdifs2020 <- summer2020 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  mutate("season-year" = "summer-2020") %>% #appending a group col
  mutate("hi - avg" = `1st Max Value` - `Arithmetic Mean`) %>%
  select("season-year", Date, "hi - avg") %>%
  rename("date" = Date) %>%
  arrange(`hi - avg`) #sort ascending order by delta

tempdifs2021 <- summer2021 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  mutate("season-year" = "summer-2021") %>% #appending a group col
  mutate("hi - avg" = `1st Max Value` - `Arithmetic Mean`) %>%
  select("season-year", Date, "hi - avg") %>%
  rename("date" = Date) %>%
  arrange(`hi - avg`) #sort ascending order by delta

tempdifs2022 <- summer2022 %>%
  filter(`Parameter Name` == "Outdoor Temperature") %>%
  mutate("season-year" = "summer-2022") %>% #appending a group col
  mutate("hi - avg" = `1st Max Value` - `Arithmetic Mean`) %>%
  select("season-year", Date, "hi - avg") %>%
  rename("date" = Date) %>%
  arrange(`hi - avg`) #sort ascending order by delta

#append heads to a dataframe
tableframe2 <- rbind(head(tempdifs2020, 10),
                     head(tempdifs2021, 10),
                     head(tempdifs2022, 10))

The code for the table formatting and kable manipulation is auto-hidden as it is a bit of an eyesore, but you are more than welcome to view it with the Show button

#knit data frame to neat kable table

tableframe2 %>%
  knitr::kable(caption = "Heads of Daily Differences of Hi and Avg. Temps for the Summers of 2020-2022 in SD") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  pack_rows("Summer 2020", 1, 10, color = 'sienna') %>%
  pack_rows("Summer 2021", 11, 20, color = 'darkorange') %>%
  pack_rows("Summer 2022", 21, 30, color = 'sandybrown') %>%
  row_spec(c(1,3,5,7,9), color = 'gray', background= 'bisque') %>%
  row_spec(c(2,4,6,8,10), color = 'gray15', background='burlywood') %>%
  row_spec(c(11,13,15,17,19), color = 'gray', background='antiquewhite') %>%
  row_spec(c(12,14,16,18,20), color = 'gray15', background='peachpuff') %>%
  row_spec(c(21, 23, 25, 27, 29), color = 'gray', background='blanchedalmond') %>%
  row_spec(c(22,24,26,28,30), color = 'gray15', background='wheat') %>%
  column_spec(3, color = 'black')
Heads of Daily Differences of Hi and Avg. Temps for the Summers of 2020-2022 in SD
season-year date hi - avg
Summer 2020
summer-2020 2020-07-29 4.583333
summer-2020 2020-07-18 4.916667
summer-2020 2020-06-02 5.125000
summer-2020 2020-06-22 5.291667
summer-2020 2020-06-01 5.375000
summer-2020 2020-06-30 5.500000
summer-2020 2020-06-01 5.791667
summer-2020 2020-06-02 5.875000
summer-2020 2020-06-21 6.083333
summer-2020 2020-08-28 6.458333
Summer 2021
summer-2021 2021-07-07 3.666667
summer-2021 2021-07-10 3.791667
summer-2021 2021-06-26 4.833333
summer-2021 2021-06-20 4.958333
summer-2021 2021-07-30 5.250000
summer-2021 2021-08-07 5.291667
summer-2021 2021-08-20 5.333333
summer-2021 2021-08-20 6.333333
summer-2021 2021-08-12 6.541667
summer-2021 2021-06-24 6.583333
Summer 2022
summer-2022 2022-08-25 3.416667
summer-2022 2022-07-26 4.708333
summer-2022 2022-07-23 4.833333
summer-2022 2022-06-30 6.208333
summer-2022 2022-08-31 6.250000
summer-2022 2022-06-05 6.458333
summer-2022 2022-06-01 6.791667
summer-2022 2022-08-06 7.333333
summer-2022 2022-08-15 7.416667
summer-2022 2022-06-11 7.500000

Part 3

Exercise 10

Here is the link to my Shiny App: Shiny App

I also have the code here, but it’s auto-hidden. If you’d like to see the code I used to build the shiny app, click Show

library(shiny)
library(shinydashboard)
library(plotly)
library(dplyr)
library(plyr)
library(data.table)

#reading in weather like before
weather <- fread("weather.csv")
weather$Date <- as.Date(weather$Date, format = "%m/%d/%Y")
weather <- weather %>% arrange(Date)

weather2020 <- weather %>% 
  filter(format(Date, "%Y") == 2020)

weather2021 <- weather %>% 
  filter(format(Date, "%Y") == 2021)

weather2022 <- weather %>% 
  filter(format(Date, "%Y") == 2022)


variable_mapping <- list(
  "Temperature" = 62101,
  "Wind Speed" = 61103,
  "Humidity" = 62201,
  "Ozone" = 44201,
  "Sulfur Dioxide" = 42401,
  "Nitrogen Dioxide" = 42602,
  "Carbon Monoxide" = 42101
)

reverse_mapping <- list(
  "62101" = "Temperature",
  "61103" = "Wind Speed",
  "62201" = "Humidity",
  "44201" = "Ozone",
  "42401" = "Sulfur Dioxide",
  "42602" = "Nitrogen Dioxide",
  "42101" = "Carbon Monoxide"
)

data_list <- list(
  "2020" = weather2020,
  "2021" = weather2021,
  "2022" = weather2022
)

ui <- navbarPage(
  title = "Interactive Yearly Weather Histograms",
  sidebarLayout(
    sidebarPanel(
      width = 3,
      h1("Select Year and Weather Variable"),
      selectInput(
        inputId = "year_choice",
        label = "Select Year",
        choices = c("2020", "2021", "2022")
      ),
      selectInput(
        inputId = "var_choice",
        label = "Select Variable",
        choices = c("Temperature", "Wind Speed", "Humidity", 
                    "Ozone", "Sulfur Dioxide", "Nitrogen Dioxide",
                    "Carbon Monoxide")
      ),
      sliderInput(
        inputId = "bins",
        label = "Number of Bins:",
        min = 5,
        max = 50,
        value = 30
      )
    ),
    mainPanel(
      h1("Histogram"),
      plotOutput("histogram", height = 700)
    )
  )
)

server <- function(input, output) {
  
  output$histogram <- renderPlot({
    selected_year <- input$year_choice
    selected_variable <- variable_mapping[[input$var_choice]]
    
    data <- data_list[[selected_year]] %>%
      filter(`Parameter Code` == selected_variable) %>%
      select(`Arithmetic Mean`)
    
    # I was getting weird bugs, so I just created a whole new column
    # so that histogram would be passed in a proper numeric vector
    #each time
    data$Arithmetic_Mean <- as.numeric(data$`Arithmetic Mean`)
    xlim <- c(min(data$Arithmetic_Mean), max(data$Arithmetic_Mean))
    
    bins <- seq(xlim[1], xlim[2], length.out = input$bins + 1)
    
    hist(data$Arithmetic_Mean, breaks = bins, col = "lightsteelblue", border = "midnightblue",
         xlab = reverse_mapping[[as.character(selected_variable)]],
         main = paste0("Histogram of ", reverse_mapping[[as.character(selected_variable)]], " for ", selected_year))
  })
}



shinyApp(ui, server)

This marks the end of my final exam. I thoroughly enjoyed the challenge you put together for us to close off the semester. I feel WAY more comfortable with R now after writing this exam. Thank you for the semester!